# Import metadata
metadata_table <- read.csv("/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/LabusMetadata.csv")
table(metadata_table$host_disease) # look at number of IBS vs HC
# Keep only relevant metadata
metadata_table <- metadata_table[, c("Run", "host_disease", "host_subtype", "Host_Age", "host_sex", "host_body_mass_index")]
rownames(metadata_table) <- metadata_table$Run # assign sample names as row names
metadata_table$host_disease <- as.character(metadata_table$host_disease) # host_disease is of class "factor", transform it to characters
metadata_table$host_disease[metadata_table$host_disease == "None"] <- "Healthy" # replace the host_disease "none" to "healthy"
metadata_table$host_disease[metadata_table$host_disease == "Irritable bowel syndrome"] <- "IBS"
saveRDS(metadata_table, "/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/metadata_final.rds")
# Import OTU table (rows: sample names // columns: sequence variants)
seqtable.nochim <- readRDS("/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/ASVtable_final.rds")
dim(seqtable.nochim) # should have 52 samples and 760 ASV
rownames(seqtable.nochim) <- gsub(".fastq.gz","", rownames(seqtable.nochim)) # remove the .fastq.gz extension in sample names
# Import Taxonomic table (rows: sequence variants // columns: Kingdom, Phylum, Class, ...)
taxa <- readRDS("/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/taxa_final.rds")
dim(taxa)
#____________________________________________________________________
# OTU TABLE, TAXA TABLE AND METADATA
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
sample_data(metadata_table),
tax_table(taxa))
#____________________________________________________________________
# PHYLOGENETIC TREE
# Multiple-sequence alignment (know which regions are conserved/different to be able to do the phylogeny)
seqs <- getSequences(seqtable.nochim) # get the sequence variants (the ASVs)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA) # by default, anchor = 0.7 which means that 70% of sequences must share a common region to anchor the alignment space.
# Construct a neighbor-joining tree
phang.align <- phyDat(as(alignment, "matrix"), type="DNA") # transform the aligned DNA sequences into a phyDat (phangorn) object
dm <- dist.ml(phang.align) # compute pairwise distance between DNA sequences (with the Jukes-Cantor estimate of the evolutionary distance)
treeNJ <- NJ(dm) # create a neighbor-joining tree estimation based on the distance matrix
fit <- pml(treeNJ, data=phang.align) # get the likelihood of the phylogenetic tree given the sequence alignment (then we'll optimize it)
## negative edges length changed to 0!
# Fit a generalized time-reversible with gamma rate variation (GTR+G+I)
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE, # gamma rate and proportion of variable size get optimized
rearrangement = "stochastic", control = pml.control(trace = 0)) # (trace = 0) don't show output during optimization
# stochastic tree rearrangement
# Add tree to physeq
physeq <- merge_phyloseq(physeq, phy_tree(fitGTR$tree))
# Look at the tree
plot_tree(physeq, color = "host_disease", ladderize="left")
#____________________________________________________________________
# GIVE SURNAMES TO OTUs
# Give surnames to sequence variants & store the sequence variants in "refseq" in the phyloseq object
dna <- DNAStringSet(taxa_names(physeq)) # get the sequence variants (ASVs)
names(dna) <- taxa_names(physeq) # no idea what this does
physeq <- merge_phyloseq(physeq, dna) # store the dna sequences in the refseq of the phyloseq object
taxa_names(physeq) <- paste0("OTU", seq(ntaxa(physeq))) # replace the whole dna sequences in the taxa_names by a surname OTU1, OTU2, etc.
# Save physeq object
saveRDS(physeq, "/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/Rproject_DataAnalysis/physeq.rds")
# Relative abundance for Phylum
phylum.table <- physeq %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
# Relative abundance for Class
class.table <- physeq %>%
tax_glom(taxrank = "Class") %>% # agglomerate at class level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
# Plot Phylum
plot_bar(physeq, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
theme(axis.text.x = element_text(size = 8))+
labs(x = "Samples", y = "Absolute abundance", title = "Labus dataset (2017)")+
ylim(0, 7000)
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(size = 8, angle = -90))+
labs(x = "Samples", y = "Relative abundance", title = "Labus dataset (2017)")
# Plot Class
plot_bar(physeq, fill = "Class")+ facet_wrap("host_disease", scales="free") +
theme(axis.text.x = element_text(size = 8))+
labs(x = "Samples", y = "Absolute abundance", title = "Labus dataset (2017)")+
ylim(0, 7000)
ggplot(class.table, aes(x = Sample, y = Abundance, fill = Class))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(size = 8, angle = -90))+
labs(x = "Samples", y = "Relative abundance", title = "Labus dataset (2017)")
# Extract abundance of only Bacteroidota and Firmicutes
bacter <- phylum.table[phylum.table[,'Phylum'] == 'Bacteroidota', c('Sample', 'Abundance',
'host_disease', 'host_subtype', 'Phylum')]
bacter <- bacter[order(bacter$Sample),] # order by sample name
firmi <- phylum.table[phylum.table[,'Phylum'] == 'Firmicutes', c('Sample', 'Abundance',
'host_disease', 'host_subtype', 'Phylum')]
firmi <- firmi[order(firmi$Sample),] # order by sample name
# Calculate log2 ratio Firmicutes/Bacteroidota
ratio_FB <- data.table('Sample' = bacter$Sample,
'host_disease' = bacter$host_disease,
'host_subtype' = bacter$host_subtype,
'Bacteroidota' = bacter$Abundance,
'Firmicutes' = firmi$Abundance)
ratio_FB[,"Log_ratio_FB"] <- log2(ratio_FB[,"Firmicutes"] / ratio_FB[,"Bacteroidota"])
# Plot log2 ratio Firmicutes/Bacteroidota
par(mar = c(3, 10, 3, 10))
boxplot(data = ratio_FB, Log_ratio_FB ~ host_disease, ylab = "Log2 Ratio Abundance", xlab = "", main = "Firmicutes to Bacteroidota ratio")
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH PSEUDOCOUNTS
physeq.pseudocts <- physeq
otu_table(physeq.pseudocts)[otu_table(physeq.pseudocts) == 0] <- 0.5
# check the 0 values have been replaced
otu_table(physeq)[1:5,1:5]
otu_table(physeq.pseudocts)[1:5,1:5]
# save the physeq.pseudocts object
saveRDS(physeq.pseudocts, "/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/Rproject_DataAnalysis/physeq_pseudocts.rds")
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.pseudocts
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)
# check the counts are all relative
otu_table(physeq.pseudocts)[1:5, 1:5]
otu_table(physeq.rel)[1:5, 1:5]
# sanity check
sum(otu_table(physeq.rel) < 0) # see how many negative values are present in the matrix
sum(rowSums(otu_table(physeq.rel)) != 1) # check if there is any row not summing to 1
# save the physeq.rel object
saveRDS(physeq.rel, "/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/Rproject_DataAnalysis/physeq_relative.rds")
#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.pseudocts
physeq.clr <- transform(physeq.clr, "clr")
# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.pseudocts)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative
# save the physeq.rel object
saveRDS(physeq.clr, "/Users/enigma/Desktop/Munich/Praktikum/Data/Labus_2017/Rproject_DataAnalysis/physeq_clr.rds")
#____________________________________________________________________________________
# Measure distances
Distances <- function(physeq_obj){
set.seed(123) # for unifrac, need to set a seed
glom.UniF <- UniFrac(physeq_obj, weighted=TRUE, normalized=TRUE) # weighted unifrac
glom.ait <- distance(subset_samples(physeq.clr, sample_names(physeq.clr) != 'SRR5245605'), method = 'euclidean') # aitchison
glom.bray <- distance(physeq_obj, method = "bray") # bray-curtis
glom.jac <- distance(physeq_obj, method = "jaccard") # jaccard
glom.can <- distance(physeq_obj, method = "canberra") # canberra
dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray, "Jac" = glom.jac)
return(dist.list)
}
#____________________________________________________________________________________
# Plot 2D ordination
MDS_2D <- function(physeq_obj, ait_dist){
plist <- NULL
plist <- vector("list", length(dist_methods)+1) # save each plot to a list
names(plist) <- dist_methods # save the name of each method
names(plist)[17] <- "aitchison" # save the name of aitchison
# Loop through all distance methods
for(i in dist_methods){
# Calculate distance matrix
#print(i)
set.seed(123) # in case the distance method needs a rooted tree (weighted unifrac)
iDist <- phyloseq::distance(physeq_obj, method=i)
# Calculate ordination
set.seed(123)
iMDS <- ordinate(physeq_obj, "MDS", distance=iDist)
## Make plot
# Don't carry over previous plot (if error, p will be blank)
p <- NULL
# Create plot, store as temporary variable, p
p <- plot_ordination(physeq_obj, iMDS, color="host_disease")
# Add title to each plot
p <- p + ggtitle(paste("MDS using distance method ", i, sep=""))
# Save the graphic to the plot list
plist[[i]] = p
}
# Add aitchison
iMDS <- ordinate(physeq_obj, "MDS", distance=ait_dist)
p <- NULL
p <- plot_ordination(physeq_obj, iMDS, color="host_disease")
p <- p + ggtitle("MDS using distance method Aitchison")
plist[[17]] = p
# Creating a dataframe to plot everything
plot.df = ldply(plist, function(x) x$data)
names(plot.df)[1] <- "distance"
# Plot
p.alldist <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
geom_point(size=5, alpha=0.5) + scale_color_manual(values = c('blue', 'red'))+
facet_wrap(~distance, scales='free')+
theme(strip.text = element_text(size = 40),
legend.text = element_text(size = 20),
axis.text.x = element_text(size = 20),
axis.text.y = element_text(size = 20))+
geom_text_repel(data=plot.df %>% filter(Run == 'SRR5245605'), # Filter data first
aes(label='SRR5245605'), size = 5, nudge_y = -0.1, nudge_x = -0.1)
return(p.alldist)
}
#____________________________________________________________________________________
# Plot 3D ordination
MDS_3D <- function(d, name_dist){
# Reset parameters
mds.3D <- NULL
xyz <- NULL
fig.3D <- NULL
# Reduce distance matrix to 3 dimensions
set.seed(123) # to get the same dimensionality reduction at each run
mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
xyz <- scores(mds.3D, display="sites") # pull out the x y z coordinates
if(name_dist == 'Aitchison'){
fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type='scatter3d', mode='markers',
color=sample_data(subset_samples(physeq.rel, sample_names(physeq.rel) != 'SRR5245605'))$host_disease,
colors = c('blue', 'red')) %>%
layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
}
else{
fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type='scatter3d',
mode='markers', color=sample_data(physeq.rel)$host_disease, colors = c('blue', 'red')) %>%
add_trace(x=xyz['SRR5245605',1], y=xyz['SRR5245605',2], z=xyz['SRR5245605',3],
type='scatter3d', mode='text', text = 'SRR5245605', textfont = list(color = 'blue')) %>%
layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
}
return(fig.3D)
}
# Get the distances
no_glom.dist <- Distances(physeq_obj = physeq.rel)
# Get the 2D ordination plots
no_glom.2D.alldist <- MDS_2D(physeq.rel, no_glom.dist$Ait)
no_glom.2D.alldist + theme(title = element_text(size = 30))
# Get 3D MDS plots
no_glom.3D.UniF <- MDS_3D(no_glom.dist$UniF, 'weighted Unifrac')
no_glom.3D.Ait <- MDS_3D(no_glom.dist$Ait, 'Aitchison')
no_glom.3D.Jac <- MDS_3D(no_glom.dist$Jac, 'Jaccard')
no_glom.3D.Can <- MDS_3D(no_glom.dist$Canb, 'Canberra')
no_glom.3D.UniF
no_glom.3D.Ait
no_glom.3D.Jac
no_glom.3D.Can
Heatmaps <- function(dist_list, fontsize){
# Weighted Unifrac
heatmp.UniF <- pheatmap(as.matrix(dist_list$UniF),
clustering_distance_rows = dist_list$UniF,
clustering_distance_cols = dist_list$UniF,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
annotation_col = mat_col,
annotation_row = mat_col,
annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
subtype = c('HC' = 'blue',
'IBS-D' = '#CC0033', 'IBS-C' = '#990000', 'IBS-M' = '#FF0099',
'IBS-unspecified' = '#FF6666', 'IBS-A' = '#FF0099')),
cluster_rows = T,
cluster_cols = T,
clustering_method = 'complete', #hierarchical method
main = 'Weighted unifrac distance')
# Aitchison
heatmp.Ait <- pheatmap(as.matrix(dist_list$Ait),
clustering_distance_rows = dist_list$Ait,
clustering_distance_cols = dist_list$Ait,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
# border_color = NA,
cluster_rows = T,
cluster_cols = T,
clustering_method = "complete", #hierarchical method
annotation_col = mat_col,
annotation_row = mat_col,
annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
subtype = c('HC' = 'blue',
'IBS-D' = '#CC0033', 'IBS-C' = '#990000', 'IBS-M' = '#FF0099',
'IBS-unspecified' = '#FF6666', 'IBS-A' = '#FF0099')),
main = "Aitchison distance")
# Bray-Curtis
heatmp.Bray <- pheatmap(as.matrix(dist_list$Bray),
clustering_distance_rows = dist_list$Bray,
clustering_distance_cols = dist_list$Bray,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
# border_color = NA,
cluster_rows = T,
cluster_cols = T,
clustering_method = "complete", #hierarchical method
annotation_col = mat_col,
annotation_row = mat_col,
annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
subtype = c('HC' = 'blue',
'IBS-D' = '#CC0033', 'IBS-C' = '#990000', 'IBS-M' = '#FF0099',
'IBS-unspecified' = '#FF6666', 'IBS-A' = '#FF0099')),
main = "Bray-Curtis distance")
# Canberra
heatmp.Canb <- pheatmap(as.matrix(dist_list$Canb),
clustering_distance_rows = dist_list$Canb,
clustering_distance_cols = dist_list$Canb,
fontsize = fontsize,
fontsize_col = fontsize-5,
fontsize_row = fontsize-5,
# border_color = NA,
cluster_rows = T,
cluster_cols = T,
clustering_method = "complete", #hierarchical method
annotation_col = mat_col,
annotation_row = mat_col,
annotation_colors = list(group = c('Healthy' = 'blue', 'IBS' = 'red'),
subtype = c('HC' = 'blue',
'IBS-D' = '#CC0033', 'IBS-C' = '#990000', 'IBS-M' = '#FF0099',
'IBS-unspecified' = '#FF6666', 'IBS-A' = '#FF0099')),
main = "Canberra distance")
return(list("UniF" = heatmp.UniF, "Ait" = heatmp.Ait, "Bray" = heatmp.Bray, "Canb" = heatmp.Canb))
}
# Get the heatmaps
no_glom.dist$Canb <- distance(subset_samples(physeq.rel, sample_names(physeq.rel) != 'SRR5245605'), method = "canberra") # remove the outlier in the canberra distance
no_glom.heatmaps <- Heatmaps(dist_list = no_glom.dist, fontsize = 8)
#___________________________________________________________________
# FIGURE 1A
set.seed(785)
UniF.test = UniFrac(physeq, weighted=FALSE, normalized=T) # unweighted Unifrac dist
UniF.PCoA <- ordinate(physeq, "PCoA", distance = UniF.test) # PCoA on unweighted unifrac dist
plot_ordination(physeq, UniF.PCoA, color="host_disease")
# Plotting the first 3 dimensions of the PCoA
xyz <- as.matrix(UniF.PCoA$vectors[,1:3])
test.plot <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d",
mode="markers", color=sample_data(physeq)$host_disease, colors = c("blue", "red"))%>%
layout(title = "PCoA in 3D with unweighted Unifrac distance")
test.plot
#___________________________________________________________________
# FIGURE 1B
# Hierarchical clustering using average linkage
hc.average <- hclust(UniF.test, method = "average")
dend.average <- as.dendrogram(hc.average)
color_leaf<<-function(n){
if(is.leaf(n)){
# take the current attributes
a=attributes(n)
# deduce the line in the original data, to get the corresponding disease status
line=match(attributes(n)$label, rownames(metadata_table))
disease=metadata_table[line, "host_disease"];
if(disease=="Healthy"){col_disease="blue"};if(disease=="IBS"){col_disease="red"}
#Modification of leaf attribute
attr(n,"nodePar") <- c(a$nodePar, list(cex=1.5, lab.cex=0.6, pch=20, col=col_disease, lab.font=1, lab.cex=0.1))
}
return(n)
}
dend.average <- dendrapply(dend.average, color_leaf)
#Plot
par(mar=c(3,2,3,7), xpd=T, cex = 0.8)
plot(dend.average , main="Hierarchical clustering: average linkage", horiz = T, cex = 0.1)
legend('topleft',
legend = c("Healthy" , "IBS"),
col = c("blue", "red"),
pch = c(20,20), bty = "n", pt.cex = 1.5, cex = 1,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))